import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
%matplotlib inline
from scipy.stats import zscore
pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)
from sklearn.preprocessing import StandardScaler
##let's read csv file and show the file of data
def read_show():
global data
data=pd.read_csv(input("Enter the URL"))
# read the csv file in jupyter notebook
return data.head()# return 5 row of dataset
read_show()
Enter the URLC:\Users\sunil\Desktop\ML project\e-com\online_shoppers_intention.csv
| Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.0 | 0 | 0.0 | 1 | 0.000000 | 0.20 | 0.20 | 0.0 | 0.0 | Feb | 1 | 1 | 1 | 1 | Returning_Visitor | False | False |
| 1 | 0 | 0.0 | 0 | 0.0 | 2 | 64.000000 | 0.00 | 0.10 | 0.0 | 0.0 | Feb | 2 | 2 | 1 | 2 | Returning_Visitor | False | False |
| 2 | 0 | 0.0 | 0 | 0.0 | 1 | 0.000000 | 0.20 | 0.20 | 0.0 | 0.0 | Feb | 4 | 1 | 9 | 3 | Returning_Visitor | False | False |
| 3 | 0 | 0.0 | 0 | 0.0 | 2 | 2.666667 | 0.05 | 0.14 | 0.0 | 0.0 | Feb | 3 | 2 | 2 | 4 | Returning_Visitor | False | False |
| 4 | 0 | 0.0 | 0 | 0.0 | 10 | 627.500000 | 0.02 | 0.05 | 0.0 | 0.0 | Feb | 3 | 3 | 1 | 4 | Returning_Visitor | True | False |
def data_understand(data):
print("\n","Name of the columns",data.columns)
#lets check out the name of collumns
print("\n","size of the detaset",data.shape)
#lets check out the no. of size of detaset
print("\n","understaning the detaset information",data.info())
# understanding info
return data.isnull().sum()
# lets check missing values in dataset
data_understand(data)
Name of the columns Index(['Administrative', 'Administrative_Duration', 'Informational',
'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month',
'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType',
'Weekend', 'Revenue'],
dtype='object')
size of the detaset (12330, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Administrative 12330 non-null int64
1 Administrative_Duration 12330 non-null float64
2 Informational 12330 non-null int64
3 Informational_Duration 12330 non-null float64
4 ProductRelated 12330 non-null int64
5 ProductRelated_Duration 12330 non-null float64
6 BounceRates 12330 non-null float64
7 ExitRates 12330 non-null float64
8 PageValues 12330 non-null float64
9 SpecialDay 12330 non-null float64
10 Month 12330 non-null object
11 OperatingSystems 12330 non-null int64
12 Browser 12330 non-null int64
13 Region 12330 non-null int64
14 TrafficType 12330 non-null int64
15 VisitorType 12330 non-null object
16 Weekend 12330 non-null bool
17 Revenue 12330 non-null bool
dtypes: bool(2), float64(7), int64(7), object(2)
memory usage: 1.5+ MB
understaning the detaset information None
Administrative 0 Administrative_Duration 0 Informational 0 Informational_Duration 0 ProductRelated 0 ProductRelated_Duration 0 BounceRates 0 ExitRates 0 PageValues 0 SpecialDay 0 Month 0 OperatingSystems 0 Browser 0 Region 0 TrafficType 0 VisitorType 0 Weekend 0 Revenue 0 dtype: int64
#Exploring dataset to findout which type of missing values are present
data.loc[7:100]
| Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7 | 1 | 0.000000 | 0 | 0.0 | 0 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.0 | Feb | 1 | 2 | 1 | 5 | Returning_Visitor | True | False |
| 8 | 0 | 0.000000 | 0 | 0.0 | 2 | 37.000000 | 0.000000 | 0.100000 | 0.000000 | 0.8 | Feb | 2 | 2 | 2 | 3 | Returning_Visitor | False | False |
| 9 | 0 | 0.000000 | 0 | 0.0 | 3 | 738.000000 | 0.000000 | 0.022222 | 0.000000 | 0.4 | Feb | 2 | 4 | 1 | 2 | Returning_Visitor | False | False |
| 10 | 0 | 0.000000 | 0 | 0.0 | 3 | 395.000000 | 0.000000 | 0.066667 | 0.000000 | 0.0 | Feb | 1 | 1 | 3 | 3 | Returning_Visitor | False | False |
| 11 | 0 | 0.000000 | 0 | 0.0 | 16 | 407.750000 | 0.018750 | 0.025833 | 0.000000 | 0.4 | Feb | 1 | 1 | 4 | 3 | Returning_Visitor | False | False |
| 12 | 0 | 0.000000 | 0 | 0.0 | 7 | 280.500000 | 0.000000 | 0.028571 | 0.000000 | 0.0 | Feb | 1 | 1 | 1 | 3 | Returning_Visitor | False | False |
| 13 | 0 | 0.000000 | 0 | 0.0 | 6 | 98.000000 | 0.000000 | 0.066667 | 0.000000 | 0.0 | Feb | 2 | 5 | 1 | 3 | Returning_Visitor | False | False |
| 14 | 0 | 0.000000 | 0 | 0.0 | 2 | 68.000000 | 0.000000 | 0.100000 | 0.000000 | 0.0 | Feb | 3 | 2 | 3 | 3 | Returning_Visitor | False | False |
| 15 | 2 | 53.000000 | 0 | 0.0 | 23 | 1668.285119 | 0.008333 | 0.016313 | 0.000000 | 0.0 | Feb | 1 | 1 | 9 | 3 | Returning_Visitor | False | False |
| 16 | 0 | 0.000000 | 0 | 0.0 | 1 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.0 | Feb | 1 | 1 | 4 | 3 | Returning_Visitor | False | False |
| 17 | 0 | 0.000000 | 0 | 0.0 | 13 | 334.966667 | 0.000000 | 0.007692 | 0.000000 | 0.0 | Feb | 1 | 1 | 1 | 4 | Returning_Visitor | True | False |
| 18 | 0 | 0.000000 | 0 | 0.0 | 2 | 32.000000 | 0.000000 | 0.100000 | 0.000000 | 0.0 | Feb | 2 | 2 | 1 | 3 | Returning_Visitor | False | False |
| 19 | 0 | 0.000000 | 0 | 0.0 | 20 | 2981.166667 | 0.000000 | 0.010000 | 0.000000 | 0.0 | Feb | 2 | 4 | 4 | 4 | Returning_Visitor | False | False |
| 20 | 0 | 0.000000 | 0 | 0.0 | 8 | 136.166667 | 0.000000 | 0.008333 | 0.000000 | 1.0 | Feb | 2 | 2 | 5 | 1 | Returning_Visitor | True | False |
| 21 | 0 | 0.000000 | 0 | 0.0 | 2 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.0 | Feb | 3 | 3 | 1 | 3 | Returning_Visitor | False | False |
| 22 | 0 | 0.000000 | 0 | 0.0 | 3 | 105.000000 | 0.000000 | 0.033333 | 0.000000 | 0.0 | Feb | 3 | 2 | 1 | 5 | Returning_Visitor | False | False |
| 23 | 0 | 0.000000 | 0 | 0.0 | 2 | 15.000000 | 0.000000 | 0.100000 | 0.000000 | 0.8 | Feb | 2 | 4 | 1 | 3 | Returning_Visitor | False | False |
| 24 | 0 | 0.000000 | 0 | 0.0 | 1 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.0 | Feb | 2 | 2 | 4 | 1 | Returning_Visitor | True | False |
| 25 | 0 | 0.000000 | 0 | 0.0 | 5 | 156.000000 | 0.000000 | 0.040000 | 0.000000 | 0.0 | Feb | 1 | 1 | 9 | 3 | Returning_Visitor | False | False |
| 26 | 4 | 64.600000 | 0 | 0.0 | 32 | 1135.444444 | 0.002857 | 0.009524 | 0.000000 | 0.0 | Feb | 2 | 2 | 1 | 3 | Returning_Visitor | False | False |
| 27 | 0 | 0.000000 | 0 | 0.0 | 4 | 76.000000 | 0.050000 | 0.100000 | 0.000000 | 0.0 | Feb | 1 | 1 | 1 | 3 | Returning_Visitor | False | False |
| 28 | 0 | 0.000000 | 0 | 0.0 | 4 | 63.000000 | 0.000000 | 0.050000 | 0.000000 | 0.2 | Feb | 2 | 6 | 1 | 3 | Returning_Visitor | False | False |
| 29 | 1 | 6.000000 | 1 | 0.0 | 45 | 1582.750000 | 0.043478 | 0.050821 | 54.179764 | 0.4 | Feb | 3 | 2 | 1 | 1 | Returning_Visitor | False | False |
| 30 | 0 | 0.000000 | 0 | 0.0 | 2 | 35.000000 | 0.000000 | 0.100000 | 0.000000 | 0.0 | Feb | 1 | 1 | 6 | 3 | Returning_Visitor | False | False |
| 31 | 0 | 0.000000 | 0 | 0.0 | 3 | 78.000000 | 0.000000 | 0.066667 | 0.000000 | 0.0 | Feb | 1 | 2 | 6 | 6 | Returning_Visitor | True | False |
| 32 | 0 | 0.000000 | 0 | 0.0 | 8 | 209.500000 | 0.000000 | 0.025000 | 0.000000 | 0.0 | Feb | 2 | 2 | 1 | 1 | Returning_Visitor | False | False |
| 33 | 0 | 0.000000 | 0 | 0.0 | 10 | 183.666667 | 0.040000 | 0.080000 | 0.000000 | 0.0 | Feb | 1 | 1 | 3 | 1 | Returning_Visitor | False | False |
| 34 | 0 | 0.000000 | 0 | 0.0 | 14 | 380.500000 | 0.014286 | 0.028571 | 0.000000 | 0.0 | Feb | 2 | 2 | 1 | 1 | Returning_Visitor | False | False |
| 35 | 0 | 0.000000 | 0 | 0.0 | 52 | 2086.242857 | 0.015385 | 0.020353 | 0.000000 | 0.0 | Feb | 2 | 2 | 7 | 1 | Returning_Visitor | False | False |
| 36 | 0 | 0.000000 | 0 | 0.0 | 8 | 388.000000 | 0.025000 | 0.056250 | 0.000000 | 0.0 | Feb | 3 | 2 | 1 | 4 | Returning_Visitor | True | False |
| 37 | 2 | 18.000000 | 0 | 0.0 | 5 | 298.000000 | 0.000000 | 0.028571 | 0.000000 | 0.8 | Feb | 2 | 2 | 8 | 4 | Returning_Visitor | False | False |
| 38 | 0 | 0.000000 | 0 | 0.0 | 7 | 63.000000 | 0.028571 | 0.071429 | 0.000000 | 0.6 | Feb | 2 | 2 | 1 | 3 | Returning_Visitor | False | False |
| 39 | 0 | 0.000000 | 0 | 0.0 | 9 | 482.000000 | 0.000000 | 0.022222 | 0.000000 | 0.0 | Feb | 2 | 5 | 1 | 6 | Returning_Visitor | False | False |
| 40 | 1 | 9.000000 | 0 | 0.0 | 46 | 4084.393939 | 0.000000 | 0.001795 | 0.000000 | 0.0 | Feb | 2 | 2 | 8 | 4 | Returning_Visitor | False | False |
| 41 | 0 | 0.000000 | 0 | 0.0 | 3 | 22.000000 | 0.000000 | 0.066667 | 0.000000 | 0.6 | Feb | 1 | 1 | 3 | 2 | Returning_Visitor | False | False |
| 42 | 0 | 0.000000 | 0 | 0.0 | 15 | 310.166667 | 0.000000 | 0.006667 | 0.000000 | 0.0 | Feb | 1 | 1 | 4 | 4 | Returning_Visitor | False | False |
| 43 | 0 | 0.000000 | 0 | 0.0 | 2 | 34.000000 | 0.000000 | 0.050000 | 0.000000 | 0.4 | Feb | 3 | 2 | 2 | 3 | Returning_Visitor | False | False |
| 44 | 0 | 0.000000 | 0 | 0.0 | 4 | 88.000000 | 0.000000 | 0.050000 | 0.000000 | 0.0 | Feb | 4 | 1 | 1 | 3 | Returning_Visitor | False | False |
| 45 | 0 | 0.000000 | 0 | 0.0 | 22 | 622.250000 | 0.003030 | 0.006061 | 0.000000 | 0.2 | Feb | 2 | 5 | 1 | 4 | Returning_Visitor | False | False |
| 46 | 0 | 0.000000 | 0 | 0.0 | 14 | 222.400000 | 0.017143 | 0.057143 | 0.000000 | 0.0 | Feb | 1 | 1 | 1 | 2 | Returning_Visitor | False | False |
| 47 | 0 | 0.000000 | 0 | 0.0 | 3 | 80.000000 | 0.066667 | 0.133333 | 0.000000 | 0.2 | Feb | 3 | 2 | 1 | 3 | Returning_Visitor | False | False |
| 48 | 0 | 0.000000 | 0 | 0.0 | 11 | 800.833333 | 0.000000 | 0.003636 | 0.000000 | 0.0 | Feb | 3 | 2 | 1 | 3 | Returning_Visitor | False | False |
| 49 | 0 | 0.000000 | 0 | 0.0 | 1 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.6 | Feb | 2 | 2 | 3 | 2 | Returning_Visitor | False | False |
| 50 | 0 | 0.000000 | 0 | 0.0 | 1 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.0 | Feb | 1 | 1 | 3 | 4 | Returning_Visitor | True | False |
| 51 | 0 | 0.000000 | 0 | 0.0 | 12 | 265.166667 | 0.011111 | 0.026111 | 0.000000 | 0.2 | Feb | 3 | 2 | 1 | 3 | Returning_Visitor | False | False |
| 52 | 0 | 0.000000 | 0 | 0.0 | 2 | 29.000000 | 0.000000 | 0.100000 | 0.000000 | 1.0 | Feb | 2 | 4 | 4 | 2 | Returning_Visitor | True | False |
| 53 | 0 | 0.000000 | 0 | 0.0 | 4 | 160.000000 | 0.000000 | 0.075000 | 0.000000 | 0.0 | Feb | 4 | 2 | 2 | 3 | Returning_Visitor | False | False |
| 54 | 0 | 0.000000 | 0 | 0.0 | 4 | 135.666667 | 0.050000 | 0.025000 | 0.000000 | 0.4 | Feb | 3 | 3 | 1 | 4 | Returning_Visitor | False | False |
| 55 | 0 | 0.000000 | 0 | 0.0 | 1 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.2 | Feb | 2 | 4 | 1 | 3 | Returning_Visitor | False | False |
| 56 | 0 | 0.000000 | 0 | 0.0 | 1 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.6 | Feb | 3 | 2 | 3 | 3 | Returning_Visitor | False | False |
| 57 | 4 | 56.000000 | 2 | 120.0 | 36 | 998.741667 | 0.000000 | 0.014736 | 19.447079 | 0.2 | Feb | 2 | 2 | 4 | 1 | Returning_Visitor | False | False |
| 58 | 0 | 0.000000 | 0 | 0.0 | 4 | 104.000000 | 0.000000 | 0.050000 | 0.000000 | 0.4 | Feb | 2 | 6 | 6 | 3 | Returning_Visitor | False | False |
| 59 | 2 | 16.000000 | 0 | 0.0 | 16 | 381.686508 | 0.011765 | 0.046569 | 0.000000 | 0.6 | Feb | 2 | 4 | 2 | 1 | Returning_Visitor | False | False |
| 60 | 0 | 0.000000 | 0 | 0.0 | 6 | 169.000000 | 0.000000 | 0.016667 | 0.000000 | 0.4 | Feb | 1 | 1 | 3 | 3 | Returning_Visitor | False | False |
| 61 | 0 | 0.000000 | 0 | 0.0 | 8 | 400.800000 | 0.050000 | 0.120833 | 0.000000 | 0.0 | Feb | 1 | 1 | 1 | 3 | Returning_Visitor | False | False |
| 62 | 12 | 279.416667 | 0 | 0.0 | 42 | 1553.583333 | 0.009000 | 0.019667 | 38.308493 | 0.0 | Feb | 1 | 1 | 3 | 2 | Returning_Visitor | False | False |
| 63 | 0 | 0.000000 | 0 | 0.0 | 14 | 706.500000 | 0.000000 | 0.007143 | 0.000000 | 0.0 | Feb | 2 | 2 | 5 | 4 | Returning_Visitor | True | False |
| 64 | 0 | 0.000000 | 0 | 0.0 | 1 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.0 | Feb | 2 | 2 | 4 | 3 | Returning_Visitor | False | False |
| 65 | 3 | 87.833333 | 0 | 0.0 | 27 | 798.333333 | 0.000000 | 0.012644 | 22.916036 | 0.8 | Feb | 2 | 2 | 3 | 1 | Returning_Visitor | False | True |
| 66 | 4 | 44.000000 | 0 | 0.0 | 90 | 6951.972222 | 0.002151 | 0.015013 | 0.000000 | 0.0 | Feb | 4 | 1 | 1 | 3 | Returning_Visitor | False | False |
| 67 | 0 | 0.000000 | 0 | 0.0 | 1 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.6 | Feb | 2 | 2 | 5 | 1 | Returning_Visitor | False | False |
| 68 | 0 | 0.000000 | 0 | 0.0 | 18 | 902.000000 | 0.000000 | 0.007407 | 0.000000 | 0.0 | Feb | 2 | 7 | 2 | 4 | Returning_Visitor | False | False |
| 69 | 0 | 0.000000 | 0 | 0.0 | 1 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.2 | Feb | 3 | 2 | 3 | 3 | Returning_Visitor | False | False |
| 70 | 0 | 0.000000 | 0 | 0.0 | 1 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.0 | Feb | 2 | 6 | 1 | 2 | Returning_Visitor | False | False |
| 71 | 0 | 0.000000 | 0 | 0.0 | 20 | 197.377778 | 0.025000 | 0.052500 | 0.000000 | 0.0 | Feb | 2 | 6 | 6 | 1 | Returning_Visitor | False | False |
| 72 | 0 | 0.000000 | 0 | 0.0 | 2 | 43.000000 | 0.000000 | 0.100000 | 0.000000 | 0.4 | Feb | 2 | 2 | 3 | 3 | Returning_Visitor | False | False |
| 73 | 0 | 0.000000 | 0 | 0.0 | 8 | 426.666667 | 0.000000 | 0.012500 | 0.000000 | 0.0 | Feb | 2 | 4 | 3 | 2 | Returning_Visitor | False | False |
| 74 | 0 | 0.000000 | 0 | 0.0 | 3 | 135.000000 | 0.000000 | 0.066667 | 0.000000 | 0.0 | Feb | 2 | 4 | 3 | 3 | Returning_Visitor | False | False |
| 75 | 0 | 0.000000 | 0 | 0.0 | 16 | 588.333333 | 0.000000 | 0.025000 | 0.000000 | 0.0 | Feb | 2 | 4 | 1 | 1 | Returning_Visitor | False | False |
| 76 | 10 | 1005.666667 | 0 | 0.0 | 36 | 2111.341667 | 0.004348 | 0.014493 | 11.439412 | 0.0 | Feb | 2 | 6 | 1 | 2 | Returning_Visitor | False | True |
| 77 | 0 | 0.000000 | 0 | 0.0 | 2 | 76.000000 | 0.000000 | 0.050000 | 0.000000 | 0.6 | Feb | 3 | 2 | 3 | 3 | Returning_Visitor | False | False |
| 78 | 0 | 0.000000 | 0 | 0.0 | 1 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 1.0 | Feb | 1 | 1 | 1 | 3 | Returning_Visitor | True | False |
| 79 | 0 | 0.000000 | 0 | 0.0 | 1 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.4 | Feb | 1 | 1 | 1 | 3 | Returning_Visitor | False | False |
| 80 | 0 | 0.000000 | 0 | 0.0 | 7 | 208.000000 | 0.000000 | 0.028571 | 0.000000 | 0.0 | Feb | 4 | 1 | 1 | 5 | Returning_Visitor | True | False |
| 81 | 0 | 0.000000 | 0 | 0.0 | 4 | 270.000000 | 0.000000 | 0.016667 | 0.000000 | 0.8 | Feb | 1 | 1 | 1 | 3 | Returning_Visitor | False | False |
| 82 | 0 | 0.000000 | 0 | 0.0 | 6 | 39.500000 | 0.000000 | 0.020000 | 0.000000 | 0.0 | Feb | 2 | 2 | 1 | 3 | Returning_Visitor | False | False |
| 83 | 0 | 0.000000 | 0 | 0.0 | 12 | 375.000000 | 0.016667 | 0.058333 | 0.000000 | 0.0 | Feb | 3 | 2 | 4 | 1 | Returning_Visitor | False | False |
| 84 | 0 | 0.000000 | 0 | 0.0 | 1 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.8 | Feb | 2 | 2 | 2 | 1 | Returning_Visitor | True | False |
| 85 | 0 | 0.000000 | 0 | 0.0 | 1 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.0 | Feb | 1 | 1 | 1 | 3 | Returning_Visitor | False | False |
| 86 | 0 | 0.000000 | 0 | 0.0 | 7 | 150.000000 | 0.057143 | 0.085714 | 0.000000 | 0.0 | Feb | 2 | 2 | 2 | 1 | Returning_Visitor | False | False |
| 87 | 0 | 0.000000 | 0 | 0.0 | 3 | 138.000000 | 0.000000 | 0.066667 | 0.000000 | 0.0 | Feb | 1 | 1 | 1 | 3 | Returning_Visitor | False | False |
| 88 | 0 | 0.000000 | 0 | 0.0 | 7 | 337.500000 | 0.028571 | 0.023810 | 0.000000 | 0.4 | Feb | 4 | 1 | 3 | 3 | Returning_Visitor | False | False |
| 89 | 0 | 0.000000 | 0 | 0.0 | 19 | 620.033333 | 0.000000 | 0.007895 | 0.000000 | 0.0 | Feb | 1 | 1 | 4 | 2 | Returning_Visitor | False | False |
| 90 | 2 | 36.000000 | 0 | 0.0 | 15 | 168.846154 | 0.000000 | 0.011765 | 0.000000 | 0.4 | Feb | 1 | 1 | 1 | 3 | Returning_Visitor | False | False |
| 91 | 0 | 0.000000 | 0 | 0.0 | 1 | 0.000000 | 0.200000 | 0.200000 | 0.000000 | 0.0 | Feb | 1 | 1 | 1 | 2 | Returning_Visitor | True | False |
| 92 | 0 | 0.000000 | 0 | 0.0 | 2 | 52.000000 | 0.000000 | 0.100000 | 0.000000 | 0.0 | Feb | 1 | 1 | 1 | 3 | Returning_Visitor | False | False |
| 93 | 0 | 0.000000 | 0 | 0.0 | 13 | 649.250000 | 0.000000 | 0.015385 | 0.000000 | 0.0 | Feb | 2 | 2 | 1 | 5 | New_Visitor | False | False |
| 94 | 0 | 0.000000 | 0 | 0.0 | 27 | 925.333333 | 0.003704 | 0.025926 | 0.000000 | 0.6 | Feb | 4 | 1 | 3 | 3 | Returning_Visitor | False | False |
| 95 | 0 | 0.000000 | 0 | 0.0 | 2 | 33.000000 | 0.000000 | 0.100000 | 0.000000 | 0.2 | Feb | 1 | 1 | 1 | 3 | Returning_Visitor | False | False |
| 96 | 0 | 0.000000 | 0 | 0.0 | 6 | 1566.500000 | 0.050000 | 0.066667 | 0.000000 | 0.2 | Feb | 1 | 1 | 1 | 3 | Returning_Visitor | False | False |
| 97 | 0 | 0.000000 | 0 | 0.0 | 4 | 105.000000 | 0.000000 | 0.025000 | 0.000000 | 0.6 | Feb | 1 | 1 | 1 | 4 | Returning_Visitor | False | False |
| 98 | 0 | 0.000000 | 1 | 0.0 | 7 | 50.000000 | 0.038095 | 0.080952 | 0.000000 | 0.6 | Feb | 2 | 4 | 1 | 7 | Returning_Visitor | False | False |
| 99 | 0 | 0.000000 | 0 | 0.0 | 16 | 644.200000 | 0.004167 | 0.031667 | 0.000000 | 0.0 | Feb | 3 | 2 | 3 | 4 | Returning_Visitor | False | False |
| 100 | 3 | 18.333333 | 0 | 0.0 | 38 | 2635.177778 | 0.000000 | 0.008947 | 0.000000 | 0.4 | Feb | 2 | 4 | 1 | 2 | Returning_Visitor | False | False |
#copy orignal data into anathor name
import copy
df_copy=copy.deepcopy(data)
#finding which type of unique values have present
df_copy['Month'].unique()
array(['Feb', 'Mar', 'May', 'Oct', 'June', 'Jul', 'Aug', 'Nov', 'Sep',
'Dec'], dtype=object)
df_copy['VisitorType'].unique()
array(['Returning_Visitor', 'New_Visitor', 'Other'], dtype=object)
df_copy.shape
(12330, 18)
##convert non numeric to numeric
from sklearn.preprocessing import LabelEncoder
df_copy.loc[:,['Month', 'VisitorType', 'Weekend','Revenue']] = df_copy.loc[:,['Month', 'VisitorType', 'Weekend','Revenue']].apply(LabelEncoder().fit_transform)
df_copy.head()
| Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.0 | 0 | 0.0 | 1 | 0.000000 | 0.20 | 0.20 | 0.0 | 0.0 | 2 | 1 | 1 | 1 | 1 | 2 | 0 | 0 |
| 1 | 0 | 0.0 | 0 | 0.0 | 2 | 64.000000 | 0.00 | 0.10 | 0.0 | 0.0 | 2 | 2 | 2 | 1 | 2 | 2 | 0 | 0 |
| 2 | 0 | 0.0 | 0 | 0.0 | 1 | 0.000000 | 0.20 | 0.20 | 0.0 | 0.0 | 2 | 4 | 1 | 9 | 3 | 2 | 0 | 0 |
| 3 | 0 | 0.0 | 0 | 0.0 | 2 | 2.666667 | 0.05 | 0.14 | 0.0 | 0.0 | 2 | 3 | 2 | 2 | 4 | 2 | 0 | 0 |
| 4 | 0 | 0.0 | 0 | 0.0 | 10 | 627.500000 | 0.02 | 0.05 | 0.0 | 0.0 | 2 | 3 | 3 | 1 | 4 | 2 | 1 | 0 |
# go for discriptive statestics
df_copy.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Administrative | 12330.0 | 2.315166 | 3.321784 | 0.0 | 0.000000 | 1.000000 | 4.000000 | 27.000000 |
| Administrative_Duration | 12330.0 | 80.818611 | 176.779107 | 0.0 | 0.000000 | 7.500000 | 93.256250 | 3398.750000 |
| Informational | 12330.0 | 0.503569 | 1.270156 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 24.000000 |
| Informational_Duration | 12330.0 | 34.472398 | 140.749294 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 2549.375000 |
| ProductRelated | 12330.0 | 31.731468 | 44.475503 | 0.0 | 7.000000 | 18.000000 | 38.000000 | 705.000000 |
| ProductRelated_Duration | 12330.0 | 1194.746220 | 1913.669288 | 0.0 | 184.137500 | 598.936905 | 1464.157214 | 63973.522230 |
| BounceRates | 12330.0 | 0.022191 | 0.048488 | 0.0 | 0.000000 | 0.003112 | 0.016813 | 0.200000 |
| ExitRates | 12330.0 | 0.043073 | 0.048597 | 0.0 | 0.014286 | 0.025156 | 0.050000 | 0.200000 |
| PageValues | 12330.0 | 5.889258 | 18.568437 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 361.763742 |
| SpecialDay | 12330.0 | 0.061427 | 0.198917 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| Month | 12330.0 | 5.163990 | 2.370199 | 0.0 | 5.000000 | 6.000000 | 7.000000 | 9.000000 |
| OperatingSystems | 12330.0 | 2.124006 | 0.911325 | 1.0 | 2.000000 | 2.000000 | 3.000000 | 8.000000 |
| Browser | 12330.0 | 2.357097 | 1.717277 | 1.0 | 2.000000 | 2.000000 | 2.000000 | 13.000000 |
| Region | 12330.0 | 3.147364 | 2.401591 | 1.0 | 1.000000 | 3.000000 | 4.000000 | 9.000000 |
| TrafficType | 12330.0 | 4.069586 | 4.025169 | 1.0 | 2.000000 | 2.000000 | 4.000000 | 20.000000 |
| VisitorType | 12330.0 | 1.718329 | 0.690759 | 0.0 | 2.000000 | 2.000000 | 2.000000 | 2.000000 |
| Weekend | 12330.0 | 0.232603 | 0.422509 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| Revenue | 12330.0 | 0.154745 | 0.361676 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
# working on outlier
df_copy.skew()
Administrative 1.960357 Administrative_Duration 5.615719 Informational 4.036464 Informational_Duration 7.579185 ProductRelated 4.341516 ProductRelated_Duration 7.263228 BounceRates 2.947855 ExitRates 2.148789 PageValues 6.382964 SpecialDay 3.302667 Month -0.832535 OperatingSystems 2.066285 Browser 3.242350 Region 0.983549 TrafficType 1.962987 VisitorType -2.065135 Weekend 1.265962 Revenue 1.909509 dtype: float64
df_copy.kurtosis()
Administrative 4.701146 Administrative_Duration 50.556739 Informational 26.932266 Informational_Duration 76.316853 ProductRelated 31.211707 ProductRelated_Duration 137.174164 BounceRates 7.723159 ExitRates 4.017035 PageValues 65.635694 SpecialDay 9.913659 Month -0.368330 OperatingSystems 10.456843 Browser 12.746733 Region -0.148680 TrafficType 3.479711 VisitorType 2.295013 Weekend -0.397404 Revenue 1.646493 dtype: float64
# finding outlier using box-plot technique
for i in df_copy.columns:
fig = px.box(df_copy, y=i, width=600, height=400, title=i , template="plotly_dark")
fig.show()
# Treatment on extream values using zscore
def multi_tretment_outlier(df_copy):
global data_z
print("List of outliers")
data_z= df_copy.apply(zscore) # find outlier using zscore
print(np.where(data_z > 3))
feature_columns = df_copy.columns
for i in feature_columns: #tretment of outliers
df_copy[i] = np.where(df_copy[i] > (df_copy[i].quantile(0.75) + (df_copy[i].quantile(0.75) - df_copy[i].quantile(0.25))*1.5),
(df_copy[i].quantile(0.75) + (df_copy[i].quantile(0.75) - df_copy[i].quantile(0.25))*1.5),
np.where(df_copy[i] < (df_copy[i].quantile(0.25) - (df_copy[i].quantile(0.75) - df_copy[i].quantile(0.25))*1.5),
(df_copy[i].quantile(0.25) - (df_copy[i].quantile(0.75) - df_copy[i].quantile(0.25))*1.5),df_copy[i]))
for i in feature_columns:
fig = px.box(df_copy , y= i, width=600, height=400, title=i, template="plotly_dark")
fig.show()
global data_z_1
print(" Resolve outliers")
data_z_1= df_copy.apply(zscore)
print(np.where(data_z_1 > 18)) ## treatment on extream value using z_score
multi_tretment_outlier(df_copy)
List of outliers (array([ 0, 0, 2, ..., 12321, 12321, 12321], dtype=int64), array([ 6, 7, 6, ..., 6, 7, 12], dtype=int64))
Resolve outliers (array([], dtype=int64), array([], dtype=int64))
sns.pairplot(df_copy, diag_kind = "kde")
<seaborn.axisgrid.PairGrid at 0x2ba913ad730>
sns.heatmap(df_copy.corr(method = "pearson"), annot = True, cmap="YlGnBu", linewidths=.5)
<AxesSubplot:>
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
#scale dataset using StandardScaler method
scale_1 = StandardScaler()
train_scale = scale_1.fit_transform(df_copy)
train_scale[:5]
array([[-0.75184584, -0.71892194, 0. , 0. , -1.02151789,
-0.95800238, 2.09099881, 2.19976889, 0. , 0. ,
-1.69524807, -1.39545325, 0. , -0.90396852, -1.0640987 ,
0. , 0. , 0. ],
[-0.75184584, -0.71892194, 0. , 0. , -0.98176428,
-0.8960516 , -0.73473346, 2.0827199 , 0. , 0. ,
-1.69524807, -0.12409414, 0. , -0.90396852, -0.58402783,
0. , 0. , 0. ],
[-0.75184584, -0.71892194, 0. , 0. , -1.02151789,
-0.95800238, 2.09099881, 2.19976889, 0. , 0. ,
-1.69524807, 2.4186241 , 0. , 2.28404472, -0.10395695,
0. , 0. , 0. ],
[-0.75184584, -0.71892194, 0. , 0. , -0.98176428,
-0.9554211 , 2.09099881, 2.19976889, 0. , 0. ,
-1.69524807, 1.14726498, 0. , -0.47890009, 0.37611392,
0. , 0. , 0. ],
[-0.75184584, -0.71892194, 0. , 0. , -0.66373539,
-0.35059439, 0.60984822, 0.44403433, 0. , 0. ,
-1.69524807, 1.14726498, 0. , -0.90396852, 0.37611392,
0. , 0. , 0. ]])
from sklearn.decomposition import PCA
# Reduce dimention using dimention technique
def dim_fn(train_scale, method):
global train_dim
reduce = method
train_dim = reduce.fit_transform(train_scale)
var_train = np.var(train_dim, axis = 0)
var_train_ratio = var_train / np.sum(var_train)
print("varance ratio for train : ", var_train_ratio)
return
dim_fn(train_scale, PCA(n_components = 4))
varance ratio for train : [0.48146673 0.18950325 0.166958 0.16207202]
from sklearn.decomposition import FactorAnalysis
dim_fn(train_scale, FactorAnalysis(n_components = 4))
varance ratio for train : [0.29914582 0.28898208 0.25808474 0.15378736]
from sklearn.decomposition import KernelPCA
dim_fn(train_scale, KernelPCA(n_components = 4, kernel = 'rbf'))
varance ratio for train : [0.37167375 0.28562346 0.17158213 0.17112066]
dim_fn(train_scale, KernelPCA(n_components = 4, kernel = 'polynomial'))
varance ratio for train : [0.46664511 0.23428113 0.1550406 0.14403316]
dim_fn(train_scale, KernelPCA(n_components = 4, kernel = 'sigmoid'))
varance ratio for train : [0.48885254 0.18240123 0.1686468 0.16009943]
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
# doing clustering use KMeans clustering method ilbo_method
wss = []
for i in range(1,10):
KM = KMeans(n_clusters = i)
KM.fit(train_dim)
wss.append(KM.inertia_)
plt.plot(range(1,10), wss)
[<matplotlib.lines.Line2D at 0x2baac73cdf0>]
# clustering using silhouett score
for i in range(2,10):
k_means = KMeans(n_clusters = i, random_state = 15)
k_means.fit(train_dim)
labels = k_means.labels_
print("Silhouette Score for ", i, " Clusters Solutions : ", silhouette_score(train_dim, labels))
Silhouette Score for 2 Clusters Solutions : 0.27601639167372266 Silhouette Score for 3 Clusters Solutions : 0.22089542902635761 Silhouette Score for 4 Clusters Solutions : 0.20878465012364017 Silhouette Score for 5 Clusters Solutions : 0.20237799361669923 Silhouette Score for 6 Clusters Solutions : 0.19414603075255302 Silhouette Score for 7 Clusters Solutions : 0.1944436104499032 Silhouette Score for 8 Clusters Solutions : 0.19426957032900707 Silhouette Score for 9 Clusters Solutions : 0.1941956347050781
k_means_final = KMeans(n_clusters = 2, random_state = 98)
k_means_final.fit(train_dim)
labels_4 = k_means_final.labels_
labels_4[:5]
array([0, 0, 0, 0, 0])
train_cluster_kmeans = copy.deepcopy(df_copy)
train_cluster_kmeans['Label'] = labels_4
train_cluster_kmeans.head()
| Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | Label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.000000 | 0.042031 | 0.103571 | 0.0 | 0.0 | 2.0 | 1.0 | 2.0 | 1.0 | 1.0 | 2.0 | 0.0 | 0.0 | 0 |
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 64.000000 | 0.000000 | 0.100000 | 0.0 | 0.0 | 2.0 | 2.0 | 2.0 | 1.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0 |
| 2 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.000000 | 0.042031 | 0.103571 | 0.0 | 0.0 | 2.0 | 4.0 | 2.0 | 8.5 | 3.0 | 2.0 | 0.0 | 0.0 | 0 |
| 3 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 2.666667 | 0.042031 | 0.103571 | 0.0 | 0.0 | 2.0 | 3.0 | 2.0 | 2.0 | 4.0 | 2.0 | 0.0 | 0.0 | 0 |
| 4 | 0.0 | 0.0 | 0.0 | 0.0 | 10.0 | 627.500000 | 0.020000 | 0.050000 | 0.0 | 0.0 | 2.0 | 3.0 | 2.0 | 1.0 | 4.0 | 2.0 | 0.0 | 0.0 | 0 |
train_cluster_kmeans.groupby('Label').mean()
| Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Label | ||||||||||||||||||
| 0 | 0.541492 | 12.549136 | 0.0 | 0.0 | 12.090056 | 416.374291 | 0.015597 | 0.050725 | 0.0 | 0.0 | 5.159763 | 2.099654 | 2.0 | 3.155650 | 3.337711 | 2.0 | 0.0 | 0.0 |
| 1 | 4.324940 | 113.964334 | 0.0 | 0.0 | 45.434642 | 1725.206104 | 0.004940 | 0.018140 | 0.0 | 0.0 | 5.649509 | 2.094982 | 2.0 | 3.089428 | 3.061100 | 2.0 | 0.0 | 0.0 |
k_means_3 = KMeans(n_clusters = 2, random_state = 98)
k_means_3.fit(train_dim)
labels_3 = k_means_3.labels_
train_cluster_kmeans['Label_3'] = labels_3
train_cluster_kmeans.groupby('Label_3').mean()
| Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | Label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Label_3 | |||||||||||||||||||
| 0 | 0.541492 | 12.549136 | 0.0 | 0.0 | 12.090056 | 416.374291 | 0.015597 | 0.050725 | 0.0 | 0.0 | 5.159763 | 2.099654 | 2.0 | 3.155650 | 3.337711 | 2.0 | 0.0 | 0.0 | 0 |
| 1 | 4.324940 | 113.964334 | 0.0 | 0.0 | 45.434642 | 1725.206104 | 0.004940 | 0.018140 | 0.0 | 0.0 | 5.649509 | 2.094982 | 2.0 | 3.089428 | 3.061100 | 2.0 | 0.0 | 0.0 | 1 |
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=3)
nbrs = neigh.fit(train_dim)
distances, indices = nbrs.kneighbors(train_dim)
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.plot(distances)
[<matplotlib.lines.Line2D at 0x2baba989880>]
for i in range(8, 26):
eps = i / 100
dbscan = DBSCAN(eps = eps)
model = dbscan.fit(train_dim)
labels = model.labels_
print(eps, '\t', silhouette_score(train_dim, labels))
0.08 -0.3228625100473922 0.09 -0.18179212633717604 0.1 0.05970306554066628 0.11 0.1931054593385504 0.12 0.19178979948700214 0.13 0.26388895890813047 0.14 0.2639010558365958 0.15 0.2801992408445043 0.16 0.28982925621357614 0.17 0.297784942148711 0.18 0.3273616359266776 0.19 0.36578851092876385 0.2 0.42071047093121916 0.21 0.42071047093121916 0.22 0.42071047093121916 0.23 0.42071047093121916 0.24 0.42071047093121916 0.25 0.42071047093121916
dbscan = DBSCAN(eps = 0.2)
final_dbscan = dbscan.fit(train_dim)
db_labels = final_dbscan.labels_
train_cluster_kmeans['Label_DBSCAN'] = db_labels
train_cluster_kmeans.groupby('Label_DBSCAN').mean()
| Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | Label | Label_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Label_DBSCAN | ||||||||||||||||||||
| -1 | 0.000000 | 0.000000 | 0.0 | 0.0 | 84.50000 | 3384.186784 | 0.042031 | 0.066159 | 0.0 | 0.0 | 4.000000 | 4.500000 | 2.0 | 8.500000 | 5.0000 | 2.0 | 0.0 | 0.0 | 1.000000 | 1.000000 |
| 0 | 2.198962 | 56.977397 | 0.0 | 0.0 | 26.69154 | 989.497238 | 0.010926 | 0.036449 | 0.0 | 0.0 | 5.374402 | 2.097413 | 2.0 | 3.126207 | 3.2164 | 2.0 | 0.0 | 0.0 | 0.437992 | 0.437992 |
train_cluster_kmeans[train_cluster_kmeans['Label_DBSCAN'] == -1]
| Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | Label | Label_3 | Label_DBSCAN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5679 | 0.0 | 0.0 | 0.0 | 0.0 | 84.5 | 3384.186784 | 0.042031 | 0.066159 | 0.0 | 0.0 | 4.0 | 4.5 | 2.0 | 8.5 | 5.0 | 2.0 | 0.0 | 0.0 | 1 | 1 | -1 |
data_anomaly = train_cluster_kmeans[train_cluster_kmeans['Label_DBSCAN'] == -1]
data_ok = train_cluster_kmeans[train_cluster_kmeans['Label_DBSCAN'] != -1]
data_ok.groupby('Label_3').mean()
| Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | Label | Label_DBSCAN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Label_3 | ||||||||||||||||||||
| 0 | 0.541492 | 12.549136 | 0.0 | 0.0 | 12.090056 | 416.374291 | 0.015597 | 0.050725 | 0.0 | 0.0 | 5.159763 | 2.099654 | 2.0 | 3.155650 | 3.337711 | 2.0 | 0.0 | 0.0 | 0 | 0 |
| 1 | 4.325741 | 113.985438 | 0.0 | 0.0 | 45.427407 | 1724.898885 | 0.004933 | 0.018131 | 0.0 | 0.0 | 5.649815 | 2.094537 | 2.0 | 3.088426 | 3.060741 | 2.0 | 0.0 | 0.0 | 1 | 0 |
data_anomaly.groupby('Label_3').mean()
| Administrative | Administrative_Duration | Informational | Informational_Duration | ProductRelated | ProductRelated_Duration | BounceRates | ExitRates | PageValues | SpecialDay | Month | OperatingSystems | Browser | Region | TrafficType | VisitorType | Weekend | Revenue | Label | Label_DBSCAN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Label_3 | ||||||||||||||||||||
| 1 | 0.0 | 0.0 | 0.0 | 0.0 | 84.5 | 3384.186784 | 0.042031 | 0.066159 | 0.0 | 0.0 | 4.0 | 4.5 | 2.0 | 8.5 | 5.0 | 2.0 | 0.0 | 0.0 | 1 | -1 |